## Steps to fine-tune the model
- Select and load model
- Select and preprocess dataset (train/eval split, tokenize)
- Define quantization and LoRA adaptation before fine-tuning for efficiency
- Tune and evaluate model





In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

from datasets import load_dataset
from peft import PeftModel
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

def initialize_model(hf_modelname, use_quantization=True):
    """Load a causal language model, optionally with a 4-bit quantization config.

    Args:
        hf_modelname: Model identifier passed straight to
            ``AutoModelForCausalLM.from_pretrained``.
        use_quantization: When truthy, a ``BitsAndBytesConfig`` with
            ``load_in_4bit=True`` is passed as ``quantization_config``;
            otherwise ``quantization_config=None`` is passed.

    Returns:
        The object returned by ``AutoModelForCausalLM.from_pretrained``,
        loaded with ``device_map="auto"``.
    """
    quantization_config = None
    if use_quantization:
        quantization_options = {
            "load_in_4bit": True,
            "bnb_4bit_use_double_quant": True,
            # NOTE(review): per the BitsAndBytesConfig docs this is the 4-bit
            # quantization data type of the quantized layers — confirm.
            "bnb_4bit_quant_type": "nf4",
            # NOTE(review): dtype used for computation, which may differ from
            # the storage type — confirm against the BitsAndBytesConfig docs.
            "bnb_4bit_compute_dtype": torch.bfloat16,
        }
        quantization_config = BitsAndBytesConfig(**quantization_options)

    return AutoModelForCausalLM.from_pretrained(
        hf_modelname,
        device_map="auto",
        quantization_config=quantization_config,
    )

def initialize_tokenizer(hf_modelname):
    """Load the tokenizer for ``hf_modelname`` and reuse EOS as the pad token.

    Args:
        hf_modelname: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, with ``pad_token`` set to its ``eos_token``.
    """
    tokenizer_options = {
        "padding_side": "left",
        "add_eos_token": True,
        "add_bos_token": True,
        "use_fast": False,  # when True this one uses a Rust-based tokenizer
    }
    loaded_tokenizer = AutoTokenizer.from_pretrained(hf_modelname, **tokenizer_options)

    # NOTE(review): pad == eos may cause a collator that masks pad tokens in
    # the labels to also mask the appended EOS — verify against the collator used.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    return loaded_tokenizer


def count_trainable_parameters(model):
    """Count the elements of all parameters that have ``requires_grad`` set.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: Total number of elements across trainable parameters; ``0`` when
        no parameter requires gradients.
    """
    # numel() yields a plain int (1 for 0-dim parameters). The previous
    # np.prod(p.size()) returned float 1.0 for an empty size, which turned the
    # whole sum into a float.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [6]:
# Hub identifier of the base model; reused by later cells when reloading.
hf_modelname = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the base model with the quantization config enabled, plus its tokenizer.
model = initialize_model(hf_modelname, use_quantization=True)
tokenizer = initialize_tokenizer(hf_modelname)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [7]:
# Report whether PyTorch can see a CUDA device.
print('GPU:', torch.cuda.is_available())

GPU: True


In [8]:
# Baseline numbers before the model is wrapped with LoRA adapters:
# the model's own parameter count, and the count of parameters with requires_grad=True.
print('Total parameters:', model.num_parameters())
print('Trainable parameters before LoRA:', count_trainable_parameters(model))

Total parameters: 7241732096
Trainable parameters before LoRA: 262410240


In [10]:
def prepare_datasets(tokenizer, file_path, train_size=0.9, seed=None):
    """Load a text file, split it into train/validation sets and tokenize both.

    Args:
        tokenizer: Callable applied to each example's ``'text'`` field.
        file_path: Passed as ``data_files`` to ``load_dataset(path='text', ...)``.
        train_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded ``0.9``.
        seed: Forwarded to both ``train_test_split`` and ``shuffle``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to make the split and ordering reproducible across runs.

    Returns:
        tuple: ``(tokenized_train_dataset, tokenized_val_dataset)``.
    """
    dataset = load_dataset(path='text', data_files=file_path, split='train')
    dataset = dataset.train_test_split(train_size=train_size, seed=seed)

    # Shuffle the training dataset
    dataset['train'] = dataset['train'].shuffle(seed=seed)

    def tokenize_example(example):
        """Tokenize the raw text of a single dataset row."""
        return tokenizer(example['text'])

    tokenized_train_dataset = dataset['train'].map(tokenize_example)
    tokenized_val_dataset = dataset['test'].map(tokenize_example)

    return tokenized_train_dataset, tokenized_val_dataset


def max_input_lengths(tokenized_train_dataset, tokenized_val_dataset):
    """Return the longest ``input_ids`` length found across both datasets.

    Args:
        tokenized_train_dataset: Iterable of examples with an ``'input_ids'`` entry.
        tokenized_val_dataset: Iterable of examples with an ``'input_ids'`` entry.

    Returns:
        int: Maximum ``len(example['input_ids'])`` over both datasets, or ``0``
        when both are empty (previously this raised ``ValueError``).
    """
    lengths = (
        len(example['input_ids'])
        for split in (tokenized_train_dataset, tokenized_val_dataset)
        for example in split
    )
    return max(lengths, default=0)

In [12]:
# Build the tokenized train/validation datasets from the text file, then report
# the longest tokenized example across both.
tokenized_train_dataset, tokenized_val_dataset = prepare_datasets(tokenizer, 'data/finetune-emails.txt')
print('Length of longest input', max_input_lengths(tokenized_train_dataset, tokenized_val_dataset))


Length of longest input 201


In [13]:
def setup_peft_model(model):
    """Wrap ``model`` with LoRA adapters for parameter-efficient fine-tuning.

    Enables gradient checkpointing on ``model``, passes it through
    ``prepare_model_for_kbit_training`` and then through ``get_peft_model``
    with a fixed ``LoraConfig``.

    Args:
        model: Model exposing ``gradient_checkpointing_enable()``.

    Returns:
        The object returned by ``get_peft_model``.
    """
    # Names of the projection modules that receive LoRA adapters.
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

    model.gradient_checkpointing_enable()
    kbit_ready_model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        bias="none",
        target_modules=lora_target_modules,
    )
    return get_peft_model(kbit_ready_model, lora_config)
    
def configure_model(model):
    """Flag ``model`` as model-parallel when more than one CUDA device is visible.

    Sets ``model.is_parallelizable`` and ``model.model_parallel`` to ``True``
    in place; with zero or one device the model is left untouched.

    Args:
        model: Object on which the two attributes are set.

    Returns:
        None.
    """
    if torch.cuda.device_count() <= 1:
        return

    for flag_name in ("is_parallelizable", "model_parallel"):
        setattr(model, flag_name, True)

def initialize_trainer(model, train_dataset, eval_dataset, run_name, tokenizer=None):
    """Build a ``Trainer`` for causal-LM fine-tuning with fixed hyperparameters.

    Args:
        model: Model to train.
        train_dataset: Dataset passed to ``Trainer`` as ``train_dataset``.
        eval_dataset: Dataset passed to ``Trainer`` as ``eval_dataset``.
        run_name: Used to build the output directory ``"./" + run_name``.
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
            When omitted, the notebook-level global ``tokenizer`` is used, which
            is what this function previously read implicitly.

    Returns:
        The configured ``Trainer`` instance.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Backward-compatible fallback for callers that rely on the global.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "initialize_trainer needs a tokenizer: pass tokenizer=... "
                "or define a global `tokenizer` first"
            )

    training_args = TrainingArguments(
        output_dir="./" + run_name,
        gradient_checkpointing=True,  # save memory
        num_train_epochs=2,  # 2 epochs should be enough but good to tune further
        learning_rate=1e-5,  # small learning rate for fine tuning
        bf16=True,  # same dtype as bnb_4bit_compute_dtype in initialize_model
        optim="paged_adamw_8bit",  # setting it here allows to fit in memory for fine tuning still
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        logging_steps=25,
        save_steps=100,  # checkpoints land in output_dir every 100 steps
        eval_steps=100,
        logging_dir="./logs",
        save_strategy="steps",
        evaluation_strategy="steps",
        do_eval=True,
    )

    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        data_collator=data_collator,
    )
    return trainer


In [None]:
print('How does model perform BEFORE fine-tuning?')

# Tokenize the instruction prompt and move the tensors to the GPU.
inputs = tokenizer("""[INST]Give me an email-only content calendar for activating users that haven't used my service in more than 3 months.
  The service is an online marketplace for prospective home buyers to find houses and apartments in the Netherlands.
  Instructions: for each content idea provided, give me why this is something that makes sense for activating dormant users,
  and how to adjust the content strategy depending on whether it has been successful at activating the user after each month.
  [/INST]""",
                   return_tensors="pt", return_attention_mask=False).to("cuda")
# `max_length` counts prompt + generated tokens, so subtracting the prompt
# length from it removed the prompt twice. `max_new_tokens` caps only the newly
# generated tokens, keeping prompt + completion within the 4096-token budget.
outputs = model.generate(**inputs, max_new_tokens=4096 - inputs['input_ids'].shape[1])
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [14]:
# Wrap the base model with LoRA adapters (rebinding `model`), then print the
# trainable share as a percentage: 100 * trainable parameters / total parameters.
model = setup_peft_model(model)
print(f'trainable params: {100 * count_trainable_parameters(model) / model.num_parameters()}')

trainable params: 1.0317767305015078


In [None]:
# NOTE(review): presumably disabled because the generation cache is not needed
# while training with gradient checkpointing — confirm.
model.config.use_cache = False

# Names the output directory ("./" + run_name) and is reused later to build checkpoint paths.
run_name = "mistral-7b-it-emails"

# Build the trainer on the tokenized splits and run fine-tuning.
trainer = initialize_trainer(model, tokenized_train_dataset, tokenized_val_dataset, run_name)
trainer.train()

In [27]:
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()

Memory consumption notes 

With QLoRA: 
- Idle: 5500MB
- Inference: 6600MB
- Finetuning batch size 64: 20200MB

Without quantization: 
- Idle: 29086MiB
- Inference: 29776MiB

In [None]:
print('How does model perform AFTER fine-tuning?')
torch.cuda.empty_cache()

# Reload a fresh base model and tokenizer to attach the saved adapters to.
model = initialize_model(hf_modelname)
tokenizer = initialize_tokenizer(hf_modelname)

# The prompt is identical for every checkpoint, so tokenize it once up front.
inputs = tokenizer("""[INST]Give me an email-only content calendar for activating users that haven't used my service in more than 3 months.
          The service is an online marketplace for prospective home buyers to find houses and apartments in the Netherlands.
          Instructions: for each content idea provided, give me why this is something that makes sense for activating dormant users,
          and how to adjust the content strategy depending on whether it has been successful at activating the user after each month.
          [/INST]""",
    return_tensors="pt", return_attention_mask=False).to("cuda")

# Checkpoints are written every 100 steps (save_steps=100); this assumes
# checkpoint-100 through checkpoint-900 exist under `run_name`.
for i in range(100, 1000, 100):
    checkpoint_name = f"{run_name}/checkpoint-{i}"
    # NOTE(review): the same base `model` object is wrapped on every iteration;
    # verify that adapters from a previous checkpoint do not linger on it.
    ft_model = PeftModel.from_pretrained(model, checkpoint_name)

    # `max_length` counts prompt + generated tokens, so subtracting the prompt
    # length from it removed the prompt twice; `max_new_tokens` caps only the
    # generated part and keeps the total within the 4096-token budget.
    outputs = ft_model.generate(**inputs, max_new_tokens=4096 - inputs['input_ids'].shape[1])
    text = tokenizer.batch_decode(outputs)[0]
    print(checkpoint_name)
    print(text)

In [None]:
# Re-declared here, presumably so this cell can run without the training cells
# above (e.g. in a fresh kernel) — confirm.
hf_modelname = "mistralai/Mistral-7B-Instruct-v0.2"
run_name = "mistral-7b-it-emails"
# Checkpoint chosen for export; the step number is hard-coded.
checkpoint_name = f"{run_name}/checkpoint-900"

# Reload the quantized base model and tokenizer, then attach the saved adapter.
model = initialize_model(hf_modelname, use_quantization=True)
tokenizer = initialize_tokenizer(hf_modelname)
ft_model = PeftModel.from_pretrained(model, checkpoint_name)


# Merge the adapter into the base model; `ft_model` is rebound to the result.
ft_model = ft_model.merge_and_unload()


In [3]:
# The adapter may already have been merged by an earlier cell, in which case
# `ft_model` is no longer a PeftModel and has no adapter left to merge.
# The guard makes this cell safe to re-run.
if isinstance(ft_model, PeftModel):
    ft_model = ft_model.merge_and_unload()



In [7]:
# Upload the merged fine-tuned model explicitly, rather than relying on `model`
# having been modified in place by the merge.
# NOTE(review): `access_token` is not defined in any visible cell — it must be
# set beforehand (ideally read from an environment variable, never hard-coded).
ft_model.push_to_hub(run_name, use_temp_dir=False, token=access_token)
tokenizer.push_to_hub(run_name, use_temp_dir=False, token=access_token)


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/azamatomu/mistral-7b-it-emails/commit/8974209ed3eaa70421ac8f3677cfb9cf579608f8', commit_message='Upload tokenizer', commit_description='', oid='8974209ed3eaa70421ac8f3677cfb9cf579608f8', pr_url=None, pr_revision=None, pr_num=None)