In [1]:
#!pip install datasets evaluate accelerate bitsandbytes peft
#!unzip airline_sentimentss.zip -d sentiments

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    TrainerCallback,
)
from datasets import load_dataset
import evaluate
import torch
print(torch.__version__)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
# Class index <-> class name lookups; both are passed to the model loader.
id2label = {0: "negative", 1: "neutral", 2: "positive"}
# Inverse of id2label, derived so the two mappings cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

In [4]:
# Differs from the guide (optional, can be removed): 4-bit NF4 quantization
# with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
# Load the Llama-2-7b checkpoint with a 3-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "/model-weights/Llama-2-7b-hf",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    # Differs from the guide (can be removed): load the weights quantized.
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /model-weights/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Target device name.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
device = "cuda"

In [7]:
# Load the tokenizer that ships with the base checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; the encoding cells below do not
# request tensors and the collator handles batching.
tokenizer = AutoTokenizer.from_pretrained(
    "/model-weights/Llama-2-7b-hf",
    # Differs from the guide: cap the sequence length used when truncating.
    model_max_length=150,
)

In [8]:
# Reuse the EOS token as the padding token so the padding collator has one to use.
tokenizer.pad_token = tokenizer.eos_token


In [9]:
# Read the local parquet file; the result is a DatasetDict with a single "train" split.
orig_dataset = load_dataset("parquet", data_files="./data_balanced.parquet")

In [10]:
orig_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Lable'],
        num_rows: 9093
    })
})

In [11]:
# Normalise the column names; the source file spells the label column "Lable".
dataset = orig_dataset.rename_column("Lable", "label").rename_column("Text", "text")

In [12]:
# Carve a 10% test split out of the single "train" split, with a fixed seed.
# Note: this rebinds `dataset`, so re-running this cell alone fails once the
# "train" split has already been split; re-run from the rename cell instead.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8183
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 910
    })
})

In [14]:
def preprocess_fn(examples):
  """Tokenize the ``text`` field of ``examples`` with the notebook-level tokenizer.

  Truncation is enabled; padding is not requested here.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True)
  return encoded

In [15]:
# Tokenize every split, handing examples to preprocess_fn in batches.
toked = dataset.map(preprocess_fn, batched=True)

In [16]:
# this is different than the guide

def change_labels(example):
  """Replace the string sentiment in ``example["label"]`` with its integer class id.

  Mapping: "negative" -> 0, "neutral" -> 1, "positive" -> 2.

  A label that is already one of the integer ids 0, 1 or 2 is left untouched,
  so mapping the dataset a second time does not corrupt it.

  Args:
    example: mapping with a "label" entry; it is modified in place.

  Returns:
    The same ``example`` object with an integer "label".

  Raises:
    ValueError: if the label is neither a known sentiment name nor a valid id,
      instead of silently treating it as "positive".
  """
  class_ids = {"negative": 0, "neutral": 1, "positive": 2}
  label = example["label"]

  # Already encoded (e.g. the cell was re-run): nothing to do.
  if isinstance(label, int) and not isinstance(label, bool) and label in class_ids.values():
    return example

  if not isinstance(label, str) or label not in class_ids:
    raise ValueError(
      f"Unexpected label {label!r}; expected one of {sorted(class_ids)}"
    )

  example["label"] = class_ids[label]
  return example

In [17]:
# Differs from the guide: convert the string labels to integer class ids,
# one example at a time.
toked = toked.map(change_labels)

In [18]:
# Padding collator built from the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# Accuracy metric; compute_metrics calls its `compute` method.
accuracy = evaluate.load("accuracy")

In [20]:
import numpy as np

def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the arg-max of the predictions along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [21]:
# Differs from the guide: Llama needs a pad token defined explicitly, so the
# model config reuses the EOS id for padding.
# NOTE(review): use_cache is switched off presumably because the cache is not
# needed while training with gradient checkpointing — confirm.
model.config.use_cache = False
model.config.pad_token_id = model.config.eos_token_id

In [22]:
# Differs from the guide (can be removed): LoRA adapters for sequence
# classification on the attention and MLP projection layers.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=['v_proj', 'down_proj', 'up_proj', 'q_proj', 'gate_proj', 'k_proj', 'o_proj'],
)
#model.add_adapter(peft_config) can be removed

# `model` is rebound to its wrapped version below, so this cell is not
# idempotent: reload the base model before running it a second time.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

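If a model reloaded from a checkpoint shows much lower accuracy at inference than the in-memory model did, see [this Hugging Face forum thread](https://discuss.huggingface.co/t/llama-2-sequence-classification-much-lower-accuracy-on-inference-from-checkpoint-compared-to-model/54910/2).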

In [23]:
class SaveScoreCallback(TrainerCallback):
    """Trainer callback that stores the classification head next to each checkpoint.

    Every time the Trainer saves, the state dict of
    ``model.model.score.original_module`` is written to
    ``<output_dir>/checkpoint-<global_step>/score.original_module.pt``.

    NOTE(review): assumes the wrapped model exposes ``.model.score.original_module``
    (as the PEFT-wrapped classifier built in this notebook does) and that the
    checkpoint directory already exists when ``on_save`` fires — confirm.
    """

    def __init__(self, model) -> None:
        """Remember the model whose score head should be saved."""
        super().__init__()
        self.model = model

    def on_save(self, args, state, control, **kwargs):
        """Write the score head's state dict into the current checkpoint folder."""
        target_path = f"{args.output_dir}/checkpoint-{state.global_step}/score.original_module.pt"
        # Use the model given to this callback, not whatever the notebook-global
        # `model` happens to be bound to when the save fires.
        score_head = self.model.model.score.original_module
        torch.save(score_head.state_dict(), target_path)

In [24]:
# Run configuration: evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="tweet_chield_normal_trained_model",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Wire the model, tokenized splits, collator and metric into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=toked["train"],
    eval_dataset=toked["test"],
    compute_metrics=compute_metrics,
)

# Also dump the score head's state dict whenever a checkpoint is written.
trainer.add_callback(SaveScoreCallback(model))

trainer.train()

# Persist the final model once training has finished.
trainer.save_model()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 