### References
- Finetune Llama 3 for Sentiment Analysis (https://www.kaggle.com/code/lucamassaron/fine-tune-llama-3-for-sentiment-analysis)
- Finetune Llama 2 for Sentiment Analysis (https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis)

In [1]:
!nvidia-smi

Thu Oct 24 12:24:06 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             25W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Libraries

In [2]:
# %pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
# %pip install -q -U -i https://pypi.org/simple/ bitsandbytes
%pip install -q -U bitsandbytes
%pip install -q -U transformers
%pip install -q -U accelerate
%pip install -q -U datasets
%pip install -q -U trl
%pip install -q -U peft
# %pip install -q -U tensorboard

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
import bitsandbytes as bnb
import wandb
from datasets import load_dataset, Dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, pipeline, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
from sklearn.metrics import f1_score

# Config

In [4]:
seed = 42
lang = 'eng'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # Might not work on Kaggle
model_id = 'meta-llama/Llama-3.2-1B-Instruct'
project_name = 'Llama-3.2-1B-Instruct-Emotion-eng'
hub_model_id = f'alxxtexxr/{project_name}'

Disabling two features in PyTorch related to memory efficiency and speed during operations on the Graphics Processing Unit (GPU) specifically for the scaled dot product attention (SDPA) function.

In [5]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [6]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


In [7]:
# print("Hugging Face token (https://huggingface.co/settings/tokens):")
# if not os.path.exists('/root/.cache/huggingface/token'):
#     hf_token = input()
#     !huggingface-cli login --token $hf_token
# else:
#     print("Hugging Face token has already been saved")

Hugging Face token (https://huggingface.co/settings/tokens):


  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
#  Login with your authentication key
wandb.login()

# setup wandb environment variables
%env WANDB_ENTITY=alimtegar
%env WANDB_PROJECT=Llama-3.2-1B-Instruct-Emotion-eng

env: WANDB_ENTITY=alimtegar
env: WANDB_PROJECT=Llama-3.2-1B-Instruct-Emotion-eng


# Data

## Load Data

In [9]:
data_files = {
    'train': f'preprocessed_data/train/{lang}.csv', 
    'val': f'preprocessed_data/val/{lang}.csv',
    'test': f'preprocessed_data/test/{lang}.csv',
}
dataset = load_dataset('alxxtexxr/SemEval2025-Task11-Dataset', data_files=data_files)

splits = data_files.keys()
df = {split: pd.DataFrame(dataset[split]) for split in splits}

cols = list(df['train'].columns)
print("DF columns:", cols)

emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
# neutral_emotion = df['train'][df['train'][emotion_cols].sum(axis=1) == 0]['emotion'].iloc[0]
# emotions = emotion_cols + [neutral_emotion]
print("Emotions columns:", emotion_cols)
print()

print("Train DF size:", len(df['train']))
print("Validation DF size:", len(df['val']))
print("Testing DF size:", len(df['test']))

preprocessed_data/train/eng.csv:   0%|          | 0.00/236k [00:00<?, ?B/s]

preprocessed_data/val/eng.csv:   0%|          | 0.00/57.3k [00:00<?, ?B/s]

preprocessed_data/test/eng.csv:   0%|          | 0.00/9.55k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DF columns: ['Unnamed: 0', 'text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']
Emotions columns: ['anger', 'fear', 'joy', 'sad', 'surprise']

Train DF size: 2214
Validation DF size: 554
Testing DF size: 116


## Create One-Hot Emotion Data

In [10]:
# df['train']['one_hot_emotion'] = df['train'].apply(lambda row: row[emotion_cols].tolist(), axis=1).tolist()
# df['val']['one_hot_emotion'] = df['val'].apply(lambda row: row[emotion_cols].tolist(), axis=1).tolist()

# df['val']['one_hot_emotion']

## Create Prompt Data

In [11]:
prompt_template = """### Instruction
Detect the emotion(s) in the given input text. 
The detected emotion(s) can be one or a combination of the following: anger, fear, joy, sad, surprise, or neutral

### Input
Text: {text}

### Output
Emotion(s): {emotion}"""

def create_prompt(row):
    emotion_list = row['emotion'].replace(" ", "").split(",")
    emotion = ", ".join([f"{e}" for e in row['emotion'].replace(" ", "").split(",")])#[1:]#+ "]"
    # emotion = '\n'.join([f"- {e}" for e in emotion_list])[2:]
    return prompt_template.format(text=row['text'], emotion=emotion).strip()

def create_test_prompt(row):
    return prompt_template.format(text=row['text'], emotion="").strip()

df['train']['prompt'] = df['train'].apply(create_prompt, axis=1)
df['val']['prompt'] = df['val'].apply(create_test_prompt, axis=1)
df['test']['prompt'] = df['test'].apply(create_test_prompt, axis=1)

print("Train prompts:\n")
for prompt in df['train']['prompt'].head(3):
    print(prompt)
    print("================================================================================================================================================================================================")
print()
print("Testing prompts:\n")
for prompt in df['test']['prompt'].head(3):
    print(prompt)
    print("================================================================================================================================================================================================")

Train prompts:

### Instruction
Detect the emotion(s) in the given input text. 
The detected emotion(s) can be one or a combination of the following: anger, fear, joy, sad, surprise, or neutral

### Input
Text: I now have 12 of those canker sore suckers in my mouth along with a fever since friday.

### Output
Emotion(s): fear, sad
### Instruction
Detect the emotion(s) in the given input text. 
The detected emotion(s) can be one or a combination of the following: anger, fear, joy, sad, surprise, or neutral

### Input
Text: It just... went away.

### Output
Emotion(s): fear, sad, surprise
### Instruction
Detect the emotion(s) in the given input text. 
The detected emotion(s) can be one or a combination of the following: anger, fear, joy, sad, surprise, or neutral

### Input
Text: I naively walked up and stuck my head in the driver's window hole.

### Output
Emotion(s): fear, surprise

Testing prompts:

### Instruction
Detect the emotion(s) in the given input text. 
The detected emotion(s

In [12]:
max_seq_lengths = {split: df[split]['prompt'].str.len().max() for split in splits}
max_seq_length = int(max(max_seq_lengths.values()))

print("Train max. prompt length:", max_seq_lengths['train'])
print("Validation max. prompt length:", max_seq_lengths['val'])
print("Testing max. prompt length:", max_seq_lengths['test'])
print()
print("Max. prompt length:", max_seq_length, f"({type(max_seq_length)})")

Train max. prompt length: 673
Validation max. prompt length: 586
Testing max. prompt length: 527

Max. prompt length: 673 (<class 'int'>)


## Create Hugging Face Datasets

In [13]:
datasets = {split: Dataset.from_pandas(df[split][['prompt']]) for split in ['train', 'val']}
datasets

{'train': Dataset({
     features: ['prompt'],
     num_rows: 2214
 }),
 'val': Dataset({
     features: ['prompt'],
     num_rows: 554
 })}

# Model

In [14]:
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_id, max_seq_length=max_seq_length)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [16]:
# max_new_tokens = df['train'].apply(lambda row: len(tokenizer.encode(row['emotion'])), axis=1).max()
# print("Max. emotion tokens:", max_new_tokens)

# Evaluation without Finetuning 

In [17]:
y_true = df['val'].apply(lambda row: row[emotion_cols].tolist(), axis=1).tolist()
print(f"True Y ({len(y_true)}):")
y_true[:10]

True Y (554):


[[0, 1, 0, 1, 0],
 [0, 1, 0, 0, 1],
 [0, 0, 0, 1, 1],
 [0, 0, 0, 1, 0],
 [1, 1, 0, 0, 1],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 1],
 [0, 1, 0, 1, 1],
 [1, 1, 0, 0, 0]]

In [18]:
def one_hot_encode_emotion(emotion, emotion_cols):
    emotions = emotion.replace(" ", "").split(",")
    one_hot_emotion = [1 if emotion_col in emotions else 0 for emotion_col in emotion_cols]
    return one_hot_emotion

def predict(df_, model, tokenizer, max_new_tokens=32, batch_size=128):
    prompt = df_['prompt'].tolist()
    pipe = pipeline(
        task='text-generation',
        model=model, 
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=0.001,
        batch_size=batch_size,
    )
    outputs = pipe(prompt)
    pred_emotion_list = [output[0]['generated_text'].split("Emotion(s): ")[-1].split("\n")[0].lower() for output in outputs]
    y_pred = [one_hot_encode_emotion(pred_emotion_i, emotion_cols) for pred_emotion_i in pred_emotion_list]
    return y_pred

y_pred = predict(df['val'], model, tokenizer)
print(f"Predicted Y ({len(y_pred)}):")
y_pred[:10]

Predicted Y (554):


[[0, 1, 1, 0, 0],
 [0, 1, 1, 0, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0],
 [1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 1, 1, 1, 1]]

In [19]:
# Compute F1 score for each type of averaging method
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0.0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
# f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0.0)
# f1_samples = f1_score(y_true, y_pred, average='samples', zero_division=0.0)
f1_per_label = f1_score(y_true, y_pred, average=None, zero_division=0.0)


print(f'F1 Score (Micro-Average): {f1_micro}')
print(f'F1 Score (Macro-Average): {f1_macro}')
print()
for label, f1 in zip(emotion_cols, f1_per_label):
    print(f"F1 Score for '{label}': {f1}")

F1 Score (Micro-Average): 0.49611398963730574
F1 Score (Macro-Average): 0.4986404737915498

F1 Score for 'anger': 0.5524861878453039
F1 Score for 'fear': 0.5146443514644352
F1 Score for 'joy': 0.5403508771929825
F1 Score for 'sad': 0.5847176079734219
F1 Score for 'surprise': 0.30100334448160543


# Finetuning

In [20]:
stop

NameError: name 'stop' is not defined

In [21]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

target_modules = find_all_linear_names(model)
print("Target LoRA modules:", target_modules)

Target LoRA modules: ['up_proj', 'o_proj', 'q_proj', 'k_proj', 'gate_proj', 'v_proj', 'down_proj']


In [24]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.0,
    r=64,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=target_modules,
)

resume_from_checkpoint = 'checkpoint-830'
resume_from_lr = 3.187593272453288e-05

train_args = TrainingArguments(
    num_train_epochs=5,            # number of training epochs
    per_device_train_batch_size=1, # batch size per device during training
    gradient_accumulation_steps=8, # number of steps before performing a backward/update pass
    gradient_checkpointing=True,   # use gradient checkpointing to save memory
    optim='paged_adamw_32bit',
    # save_steps=0,
    logging_steps=20,                         
    learning_rate=2e-4 if not resume_from_lr else resume_from_lr, # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,             # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03 if not resume_from_checkpoint else 0.0, # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type='cosine',    # use cosine learning rate scheduler
    report_to='wandb',             # report metrics to w&b
    # eval_strategy="steps",       # save checkpoint every epoch
    # eval_steps = 20.

    # Arguments for saving the training
    output_dir=project_name,       # directory to save and repository id
    save_strategy='epoch',         # save at each epoch
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy='all_checkpoints',

    # Arguments for resuming the training
    resume_from_checkpoint=resume_from_checkpoint
)

tokenizer.padding_side = 'right'

trainer = SFTTrainer(
    model=model,
    args=train_args,
    train_dataset=datasets['train'],
    # eval_dataset==datasets['val'],
    peft_config=peft_config,
    dataset_text_field='prompt',
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=False,
    dataset_kwargs={
        'add_special_tokens': False,
        'append_concat_token': False,
    },
    # compute_metrics=compute_metrics,
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2214 [00:00<?, ? examples/s]

  super().__init__(


In [25]:
trainer.train()
wandb.finish()



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113777666668687, max=1.0…

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,2.2098
40,1.3032
60,1.1187
80,1.0104
100,1.072
120,1.0076
140,0.9701
160,0.891
180,0.9233
200,0.8788


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=1380, training_loss=0.8942732499993366, metrics={'train_runtime': 3038.7168, 'train_samples_per_second': 3.643, 'train_steps_per_second': 0.454, 'total_flos': 5118029326725120.0, 'train_loss': 0.8942732499993366, 'epoch': 4.9864498644986455})

# Evaluation After Finetuning

In [34]:
model.config.use_cache = True
tokenizer.padding_side = 'left'

y_pred_ft = predict(df['val'], model, tokenizer, batch_size=64)
print(f"Predicted Y (FT) ({len(y_pred_ft)}):")
y_pred_ft[:10]

  return fn(*args, **kwargs)


Predicted Y (FT) (554):


[[0, 1, 0, 1, 1],
 [0, 1, 0, 1, 1],
 [1, 1, 0, 1, 1],
 [0, 1, 0, 1, 1],
 [1, 1, 0, 1, 1],
 [0, 1, 0, 1, 1],
 [0, 1, 0, 1, 1],
 [1, 1, 0, 1, 1],
 [1, 1, 0, 1, 1],
 [1, 1, 0, 1, 1]]

In [37]:
# Compute F1 score for each type of averaging method
f1_micro = f1_score(y_true, y_pred_ft, average='micro', zero_division=0.0)
f1_macro = f1_score(y_true, y_pred_ft, average='macro', zero_division=0.0)
# f1_weighted = f1_score(y_true, y_pred_ft, average='weighted', zero_division=0.0)
# f1_samples = f1_score(y_true, y_pred_ft, average='samples', zero_division=0.0)
f1_per_label = f1_score(y_true, y_pred_ft, average=None, zero_division=0.0)


print(f'F1 Score (Micro-Average): {f1_micro}')
print(f'F1 Score (Macro-Average): {f1_macro}')
print()
for label, f1 in zip(emotion_cols, f1_per_label):
    print(f"F1 Score for '{label}': {f1}")

F1 Score (Micro-Average): 0.6013719512195123
F1 Score (Macro-Average): 0.6124657330409786

F1 Score for 'anger': 0.638888888888889
F1 Score for 'fear': 0.8198074277854195
F1 Score for 'joy': 0.6586102719033232
F1 Score for 'sad': 0.48739495798319327
F1 Score for 'surprise': 0.45762711864406774
