## Fine-tuning with QLoRA

In [1]:
# load data
from datasets import load_dataset

# load data
train = load_dataset("json", 
                            data_files=["data/my-conll2003-dataset-train_sample.jsonl"])
                            
validation = load_dataset("json", 
                            data_files=["data/my-conll2003-dataset-val_sample.jsonl"])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_labels', 'sentence', 'entities'],
        num_rows: 1000
    })
})

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [4]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")



ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                        padding_side="left") # padding on left from brewdev notebook
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def formatting_func(entry):
    #tokenizer add bos token
    text = (f"[INST] You are an NLP expert tasked with Named Entity Extraction. "
        f"Identify entities of the type Person (PER), Organization (ORG), Location (LOC) and Miscellaneous (MISC) in the following sentence: '{entry['sentence']}'\n"
        "Your answer must be in the form of a dict {'PER':['person entity 1', 'person entity 2', '...'], 'ORG': [], 'LOC': [], 'MISC': []} \n"
        "Take care, your answer is only valid if it follows the correct format! [/INST]\n"
        f"{entry['entities']}</s>")

    return text

In [8]:
max_length = 512 # This was an appropriate max length for my dataset

def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        formatting_func(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
tokenized_train_dataset = train.map(generate_and_tokenize_prompt)
tokenized_val_dataset = validation.map(generate_and_tokenize_prompt)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 1866.97 examples/s]


In [10]:
print(tokenizer.decode(tokenized_train_dataset['train'][0]['input_ids']))

</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>

In [11]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

NameError: name 'model' is not defined

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
model = get_peft_model(model, config)

In [None]:
print_trainable_parameters(model)

trainable params: 41943040 || all params: 3794014208 || trainable%: 1.1055056122762943


In [None]:
!pip install -q wandb -U

import wandb, os
wandb.login()

wandb_project = "ner-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.5/258.5 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:

project = "ner-finetune"
base_model_name = "mistral7b"
run_name = base_model_name + "-" + project+"-"+ "run2"
output_dir = "drive/MyDrive/ner-finetuning/runs/" + run_name

In [None]:
!ls drive/MyDrive/ner-finetuning/

checkpoint-450	checkpoint-450-20240301T110956Z-001.zip  test_sample_basepreds.json


In [None]:
output_dir

'drive/MyDrive/ner-finetuning/runs/mistral7b-ner-finetune-run2'

In [None]:
from datetime import datetime
import transformers

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=500,
        learning_rate=2.5e-5, # Want a small lr for finetuning
        fp16=True,
        logging_steps=25,
        optim="paged_adamw_8bit",
        logging_dir="drive/MyDrive/ner-finetuning/runs/logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=25,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=25,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


[34m[1mwandb[0m: Currently logged in as: [33maareias[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
25,1.5936,0.675418
50,0.4751,0.474684
75,0.4196,0.442876
100,0.4598,0.407857
125,0.4279,0.390319
150,0.3899,0.373656
175,0.3422,0.363928
200,0.3418,0.35969
225,0.3069,0.355298
250,0.357,0.352663




Step,Training Loss,Validation Loss
25,1.5936,0.675418
50,0.4751,0.474684
75,0.4196,0.442876
100,0.4598,0.407857
125,0.4279,0.390319
150,0.3899,0.373656
175,0.3422,0.363928
200,0.3418,0.35969
225,0.3069,0.355298
250,0.357,0.352663




TrainOutput(global_step=500, training_loss=0.41609223747253415, metrics={'train_runtime': 4706.3189, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.106, 'total_flos': 2.1972796833792e+16, 'train_loss': 0.41609223747253415, 'epoch': 1.0})