In [1]:
%%capture
# Environment setup; %%capture hides the installers' output.
# The preinstalled torch packages are removed first, then unsloth is installed
# (presumably so pip resolves a compatible torch build - TODO confirm).
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install unsloth

import os
# Turn off Weights & Biases logging.
# NOTE: transformers reports this variable as deprecated (removal planned for v5)
# and recommends the `report_to` training argument instead.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# Load the base model and tokenizer through Unsloth.
# NOTE(review): unsloth is imported before torch; presumably so it can patch the
# libraries first - confirm against the Unsloth docs before reordering.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: it is not passed to the from_pretrained call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# The model name below is the base Llama 3.1 8B checkpoint, not the Instruct variant.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 6.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [45]:
# LoRA adapter settings, collected in one mapping and then applied to the model.
lora_settings = dict(
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# `model` is rebound to whatever get_peft_model returns for the loaded model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth: Already have LoRA adapters! We shall skip this step.


# Data prep

In [3]:
from datasets import load_dataset

# Fetch the ABSA quad dataset from the Hugging Face Hub.
# The recorded run generated train, validation and test splits.
absa_quad = load_dataset("NEUDM/absa-quad")

README.md:   0%|          | 0.00/2.83k [00:00<?, ?B/s]

Generation/train.jsonl:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Generation/dev.jsonl:   0%|          | 0.00/503k [00:00<?, ?B/s]

Generation/test.jsonl:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2098 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/525 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1081 [00:00<?, ? examples/s]

In [13]:
import ast
import json

In [29]:
# End-of-sequence token string taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token

In [46]:
# Build one complete training prompt (instructions + document + gold answer)
# for every example of the train split.
llama3_train_dataset = []

for row in absa_quad["train"]:
    # "input" is a string-encoded Python list; its first element is the raw text
    input_text = ast.literal_eval(row["input"])[0]

    # "output" is a string-encoded list of quads; each quad becomes one JSON object
    quads_as_json = [
        json.dumps({
            "opinion term": quad[0],
            "aspect category": quad[1],
            "sentiment": quad[2],
            "justification": quad[3],
        })
        for quad in ast.literal_eval(row["output"])
    ]

    # String form of the gold answer.
    # Unsloth says the EOS token has to be appended manually.
    gold_answer = '{"aspect_based_sentiment_analysis": [' + ', '.join(quads_as_json) + '] }' + EOS_TOKEN

    llama3template = f"""Below is a DOCUMENT in which human beings may be expressing themselves about products or services.

Perform a full aspect-based sentiment analysis of the DOCUMENT.
    
Only use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.
    
Only use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.
    
Use the exact words and spelling found in the DOCUMENT without modification.

Return your answer as a structured JSON object without deviation.

### DOCUMENT:

{input_text}

### ALLOWED SENTIMENTS:

- positive
- negative
- neutral

### ALLOWED ASPECT CATEGORIES:

- food quality
- service general
- restaurant general
- ambience general
- food style_options
- restaurant miscellaneous
- food prices
- restaurant prices
- drinks quality
- drinks style_options
- location general
- drinks prices
- food general

### RESPONSE:

{gold_answer}"""

    llama3_train_dataset.append(llama3template)

In [47]:
# Sanity check: print the first fully rendered training prompt.
print(llama3_train_dataset[0])

Below is a DOCUMENT in which human beings may be expressing themselves about products or services.

Perform a full aspect-based sentiment analysis of the DOCUMENT.
    
Only use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.
    
Only use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.
    
Use the exact words and spelling found in the DOCUMENT without modification.

Return your answer as a structured JSON object without deviation.

### DOCUMENT:

The wait here is long for dim sum , but if you do n't like sharing tables or if the typical raucous dim sum atmosphere is not your gig , this is a sleek ( for Chinatown ) alternative .

### ALLOWED SENTIMENTS:

- positive
- negative
- neutral

### ALLOWED ASPECT CATEGORIES:

- food quality
- service general
- restaurant general
- ambience general
- food style_options
- restaurant miscellaneous
- food prices
- restaurant prices
- drinks quality
- drinks style_options
- location general


In [48]:
import pandas as pd
from datasets import Dataset

# Convert the list of prompt strings to a Hugging Face dataset.
# The DataFrame has a single column labelled 0; it is renamed to "text"
# so the trainer can be pointed at that field.
df = pd.DataFrame(llama3_train_dataset)
text_df = df.rename(columns={0: "text"})
train_data = Dataset.from_pandas(text_df, split="train")

In [39]:
# Preview the first three rows of the prompt DataFrame (single column labelled 0).
df.head(3)

Unnamed: 0,0
0,Below is a DOCUMENT in which human beings may ...
1,Below is a DOCUMENT in which human beings may ...
2,Below is a DOCUMENT in which human beings may ...


In [41]:
# Show the first entry of the "text" column of the converted dataset.
train_data["text"][0]

'Below is a DOCUMENT in which human beings may be expressing themselves about products or services.\n\nPerform a full aspect-based sentiment analysis of the DOCUMENT.\n    \nOnly use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.\n    \nOnly use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.\n    \nUse the exact words and spelling found in the DOCUMENT without modification.\n\nReturn your answer as a structured JSON object without deviation.\n\n### DOCUMENT:\n\nThe wait here is long for dim sum , but if you do n\'t like sharing tables or if the typical raucous dim sum atmosphere is not your gig , this is a sleek ( for Chinatown ) alternative .\n\n### ALLOWED SENTIMENTS:\n\n- positive\n- negative\n- neutral\n\n### ALLOWED ASPECT CATEGORIES:\n\n- food quality\n- service general\n- restaurant general\n- ambience general\n- food style_options\n- restaurant miscellaneous\n- food prices\n- restaurant prices\n- drinks quality\n- drink

# Trying to do the train on completions only

not very clear how this works as of 2024, examples seem old

In [35]:
from trl import DataCollatorForCompletionOnlyLM

In [37]:
# Completion-only training attempt, left disabled: it doesn't seem to work
# - was getting 0 loss during training even after 1 step
# NOTE(review): the cause is not established here; possibly the collator could not
# locate the response template in the tokenized text - TODO confirm.

#response_template = "### RESPONSE:"
#completion_only_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# back to Unsloth main part

- note: trying myself with the `completion_only_collator` stuff added to the `SFTTrainer`

In [49]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered prompts (column "text" of train_data).
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 8.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    #data_collator=completion_only_collator, # NOTE -- I ADDED THIS FOR THE COMPLETIONS ONLY STUFF
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,

        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps = 5,
        max_steps = 60, # short demo run; max_steps overrides num_train_epochs

        learning_rate = 2e-4,
        # Exactly one mixed-precision mode is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # Disable logging integrations explicitly, as the transformers deprecation
        # warning recommends, instead of relying only on the WANDB_DISABLED variable.
        report_to = "none",
    ),
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map (num_proc=2):   0%|          | 0/2098 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [50]:
# Record GPU properties and the reserved-memory baseline before training starts.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla P100-PCIE-16GB. Max memory = 15.888 GB.
6.877 GB of memory reserved.


# go

lets-a-goooooooo

In [51]:
# Run the fine-tuning; the returned object's `metrics` are read by the report cell below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,098 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2101
2,2.118
3,2.0676
4,1.9069
5,1.7734
6,1.5828
7,1.2227
8,0.922
9,0.7071
10,0.5526


In [52]:
# Report training time and peak reserved GPU memory, both in total and relative
# to the baseline recorded before training.
BYTES_PER_GIB = 1024 ** 3
train_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1026.745 seconds used for training.
17.11 minutes used for training.
Peak reserved memory = 6.975 GB.
Peak reserved memory for training = 0.098 GB.
Peak reserved memory % of max memory = 43.901 %.
Peak reserved memory for training % of max memory = 0.617 %.


# Inference

In [69]:
# Inference prompt template: same instructions as the training prompt, ending at the
# response header so the model writes the answer.
# This is deliberately a plain string, NOT an f-string: an f-string would interpolate
# `query` the moment this cell runs (NameError on a fresh kernel, or a stale value
# baked in). Fill it later with `inference_llama3template.format(query=...)`.
inference_llama3template = """Below is a DOCUMENT in which human beings may be expressing themselves about products or services.

Perform a full aspect-based sentiment analysis of the DOCUMENT.
    
Only use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.
    
Only use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.
    
Use the exact words and spelling found in the DOCUMENT without modification.

Return your answer as a structured JSON object without deviation.

### DOCUMENT:

{query}

### ALLOWED SENTIMENTS:

- positive
- negative
- neutral

### ALLOWED ASPECT CATEGORIES:

- food quality
- service general
- restaurant general
- ambience general
- food style_options
- restaurant miscellaneous
- food prices
- restaurant prices
- drinks quality
- drinks style_options
- location general
- drinks prices
- food general

### RESPONSE:
"""

In [70]:
# Example document used while experimenting with the inference prompt.
query = "I hated this place's burgers they are the worst i have ever tasted!"

# DEBUG TODO


- I don't understand: if I use `tokenizer(prompt_template.format(document=current_query))` (or similar), the prompt never seems to update and keeps using old values/prompts. That's why I'm manually "writing" the prompt below.

**UPDATE: I think it's because I was using an f-string for the prompt, so the variable name inside the `{ }` was resolved immediately from a variable defined earlier in the notebook. To fill the prompt in later with `.format(...)`, use a plain `""" """` string without the `f` prefix.**

In [74]:
# NOTE -- after debugging: this is a plain string (no f prefix), so the placeholder
# variable does not need to exist when the cell runs; it is filled in later via
# .format(manual_add_query=...).
# (I think an earlier version was an f-string that picked up a variable named `query`
# BEFORE the call to .format.)
formatted_prompt = """Below is a DOCUMENT in which human beings may be expressing themselves about products or services.

Perform a full aspect-based sentiment analysis of the DOCUMENT.
    
Only use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.
    
Only use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.
    
Use the exact words and spelling found in the DOCUMENT without modification.

Return your answer as a structured JSON object without deviation.

### DOCUMENT:

{manual_add_query}

### ALLOWED SENTIMENTS:

- positive
- negative
- neutral

### ALLOWED ASPECT CATEGORIES:

- food quality
- service general
- restaurant general
- ambience general
- food style_options
- restaurant miscellaneous
- food prices
- restaurant prices
- drinks quality
- drinks style_options
- location general
- drinks prices
- food general

### RESPONSE:
"""

In [76]:
# Document to analyse; substituted into formatted_prompt at generation time.
current_query="I really hated this place's burgers they are the worst i have ever tasted in New York!"

In [81]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) # Unsloth has 2x faster inference!

# Fill the template with the current document, tokenize it and move the tensors to the GPU.
prompt_text = formatted_prompt.format(manual_add_query=current_query)
inputs = tokenizer(prompt_text, return_tensors = "pt").to("cuda")

generation_kwargs = dict(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 500,
)
outputs = model.generate(**generation_kwargs)

# The decoded sequences contain the prompt followed by the generated continuation.
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is a DOCUMENT in which human beings may be expressing themselves about products or services.\n\nPerform a full aspect-based sentiment analysis of the DOCUMENT.\n    \nOnly use sentiment labels that appear in the below list of ALLOWED SENTIMENTS.\n    \nOnly use aspect category labels that appear in the below list of ALLOWED ASPECT CATEGORIES.\n    \nUse the exact words and spelling found in the DOCUMENT without modification.\n\nReturn your answer as a structured JSON object without deviation.\n\n### DOCUMENT:\n\nI really hated this place\'s burgers they are the worst i have ever tasted in New York!\n\n### ALLOWED SENTIMENTS:\n\n- positive\n- negative\n- neutral\n\n### ALLOWED ASPECT CATEGORIES:\n\n- food quality\n- service general\n- restaurant general\n- ambience general\n- food style_options\n- restaurant miscellaneous\n- food prices\n- restaurant prices\n- drinks quality\n- drinks style_options\n- location general\n- drinks prices\n- food general\n\n### RESP

In [87]:
# Marker that separates the prompt from the model's answer in the decoded text.
RESPONSE_MARKER = "### RESPONSE:\n"

res = tokenizer.batch_decode(outputs)

# Only the first decoded sequence is handled - might not work with several in a batch.
# TODO: test with dataset not just 1 sample
first_decoded = res[0]
start_idx = first_decoded.index(RESPONSE_MARKER)

print(start_idx)

# TODO: fix this better with error handling or if train with chat template understand how it works
generated_absa_text = first_decoded[start_idx + len(RESPONSE_MARKER):]


951


In [89]:
# Remove the trailing end-of-sequence token(s) from the generated answer.
# str.rstrip(EOS_TOKEN) is NOT used here: it treats its argument as a set of
# characters, so it could also strip trailing answer characters that happen to
# occur in the token text. Only an exact EOS_TOKEN suffix is removed instead.
while EOS_TOKEN and generated_absa_text.endswith(EOS_TOKEN):
    generated_absa_text = generated_absa_text[:-len(EOS_TOKEN)]

generated_absa_text

'{"aspect_based_sentiment_analysis": [{"opinion term": "burgers", "aspect category": "food quality", "sentiment": "negative", "justification": "worst"}] }'

# try JSON loading

In [90]:
# Parse the generated answer as JSON; json.loads raises if the text is not valid JSON.
absa_data = json.loads(generated_absa_text)

# Display the list stored under the top-level key used in the training answers.
absa_data["aspect_based_sentiment_analysis"]

[{'opinion term': 'burgers',
  'aspect category': 'food quality',
  'sentiment': 'negative',
  'justification': 'worst'}]

# Saving and GGUF stuff

**NOTICED THAT THIS IS ---NOT--- THE INSTRUCT VERSION O_o**

In [91]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; done before the push_to_hub calls below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [92]:
# Base name used to build the Hub repository ids for the uploads below.
SAVE_NAME = "Llama-3.1-8B"

In [93]:
# save LoRA stuff


# Upload the LoRA adapter and the tokenizer to the same Hub repository.
lora_repo_id = f"benjaminzwhite/{SAVE_NAME}_ABSQ-AQSP_LoRA"

for uploadable in (model, tokenizer):
    uploadable.push_to_hub(lora_repo_id)

README.md:   0%|          | 0.00/595 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/benjaminzwhite/Llama-3.1-8B_ABSQ-AQSP_LoRA


# Try doing the merging

https://huggingface.co/blog/mlabonne/sft-llama3

**UPDATE: ran out of disk space; will try re-downloading the LoRA adapters and doing only the GGUF conversion.**

In [94]:
# Merge the LoRA weights into the base model and upload the result as 16-bit weights.
merged_repo_id = f"benjaminzwhite/{SAVE_NAME}_ABSQ-AQSP_LoRA_merged_16bit"
model.push_to_hub_merged(
    merged_repo_id,
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: You are pushing to hub, but you passed your HF username = benjaminzwhite.
We shall truncate benjaminzwhite/Llama-3.1-8B_ABSQ-AQSP_LoRA_merged_16bit to Llama-3.1-8B_ABSQ-AQSP_LoRA_merged_16bit
Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 19.97 out of 31.36 RAM for saving.


 38%|███▊      | 12/32 [00:00<00:00, 25.19it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:24<00:00,  1.33it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Llama-3.1-8B_ABSQ-AQSP_LoRA_merged_16bit/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Llama-3.1-8B_ABSQ-AQSP_LoRA_merged_16bit/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Llama-3.1-8B_ABSQ-AQSP_LoRA_merged_16bit/pytorch_model-00003-of-00004.bin...


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 2785153024 vs 2785152912

In [95]:
# GGUF quantization variants to export and upload.
quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]

# Pass the whole list in ONE call rather than calling push_to_hub_gguf once per
# variant: each separate call starts with its own 16-bit merge of the model, which
# repeats expensive, disk-hungry work for every quantization.
model.push_to_hub_gguf(
    f"benjaminzwhite/{SAVE_NAME}_ABSQ-AQSP_GGUF",
    tokenizer,
    quantization_method = quant_methods,
)

fatal: could not create work tree dir 'llama.cpp': No space left on device
make: *** llama.cpp: No such file or directory.  Stop.
make: *** llama.cpp: No such file or directory.  Stop.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 22.65 out of 31.36 RAM for saving.


  9%|▉         | 3/32 [00:00<00:02, 10.71it/s]


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 576 vs 470