## Steps to fine tune model
- Select and load model
- Select and preprocess dataset (train/eval split, tokenize)
- Define quantization and LoRA adaptation before fine-tuning for efficiency
- Tune and evaluate model





In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

from datasets import load_dataset
from peft import PeftModel
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

def initialize_model(hf_modelname, use_quantization=True):
    """Initialize the language model with quantization configuration for QLoRA."""
    if use_quantization:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4", # sets the data type for tuned parameters
            bnb_4bit_compute_dtype=torch.bfloat16 # sets the data type for all parameters
        )
    else: 
        bnb_config = None

    model = AutoModelForCausalLM.from_pretrained(hf_modelname, device_map="auto",
                                                 quantization_config=bnb_config)
    return model

def initialize_tokenizer(hf_modelname):
    """Initialize the tokenizer."""

    tokenizer = AutoTokenizer.from_pretrained(
        hf_modelname,
        padding_side="left",
        add_eos_token=True,
        add_bos_token=True,
        use_fast=False # when True this one uses a Rust-based tokenizer
    )
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def count_trainable_parameters(model):
    """Count the number of trainable parameters in the model."""
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params


In [2]:
hf_modelname = "mistralai/Mistral-7B-Instruct-v0.2"

model = initialize_model(hf_modelname, use_quantization=True)
tokenizer = initialize_tokenizer(hf_modelname)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
print('GPU:', torch.cuda.is_available())

GPU: True


In [4]:
print('Total parameters:', model.num_parameters())
print('Trainable parameters before LoRA:', count_trainable_parameters(model))

Total parameters: 7241732096
Trainable parameters before LoRA: 262410240


In [5]:
def prepare_datasets(tokenizer, file_path):
    """Load and tokenize the training dataset."""
    dataset = load_dataset(path='text', data_files=file_path, split='train')
    dataset = dataset.train_test_split(train_size=0.9)

    # Shuffle the training dataset
    dataset['train'] = dataset['train'].shuffle()

    generate_and_tokenize_prompt = lambda prompt: tokenizer(prompt['text'])

    tokenized_train_dataset = dataset['train'].map(generate_and_tokenize_prompt)
    tokenized_val_dataset = dataset['test'].map(generate_and_tokenize_prompt)

    return tokenized_train_dataset, tokenized_val_dataset


def max_input_lengths(tokenized_train_dataset, tokenized_val_dataset):
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset] + \
              [len(x['input_ids']) for x in tokenized_val_dataset]

    return max(lengths)

In [6]:
tokenized_train_dataset, tokenized_val_dataset = prepare_datasets(tokenizer, '../data/instruction-emails.txt')
print('Length of longest input', max_input_lengths(tokenized_train_dataset, tokenized_val_dataset))

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Length of longest input 1601


In [7]:
len(tokenized_train_dataset), len(tokenized_val_dataset)

(172, 20)

In [8]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
   

In [11]:
def setup_peft_model(model):
    """Setup PEFT (Parameter-Efficient Fine-Tuning) model."""
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        bias="none",
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    return model
    
def configure_model(model):
    """Configure model for parallelism."""
    if torch.cuda.device_count() > 1: 
        model.is_parallelizable = True
        model.model_parallel = True

def initialize_trainer(model, train_dataset, eval_dataset, run_name):
    """Initialize the Trainer for training."""

    training_args = TrainingArguments(
        output_dir="../model/" + run_name,
        gradient_checkpointing=True, # save memory
        num_train_epochs=5, # 2 epochs should be enough but good to tune further
        learning_rate=2e-5, # small learning rate for fine tuning
        bf16=True, # since we're using quantization
        optim="paged_adamw_8bit", # setting it here allows to fit in memory for fine tuning still
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_steps=25,
        save_steps=25,
        eval_steps=25,
        logging_dir="./logs",
        save_strategy="steps",
        evaluation_strategy="steps",
        do_eval=True,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        data_collator=data_collator,
    )
    return trainer


In [8]:
print('How does model perform BEFORE fine-tuning?')

inputs = tokenizer("""[INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]""",
                   return_tensors="pt", return_attention_mask=False).to("cuda")
outputs = model.generate(**inputs, max_length=4096 - inputs['input_ids'].shape[1])
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


How does model perform BEFORE fine-tuning?
<s> [INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]</s> 1. **Week 1:** [Welcome Email]
Subject: Welcome to [Company Name] - Your New Home for Apartment Shopping

- Introduce the company and its mission
- Highlight the benefits of using the platform
- Invite the user to explore the website and start their apartment search

2. **Week 2:** [New Listings Email]
Subject: Fresh Apartment Listings Just Added

- Showcase new listings that have been added to the platform
- Include high-quality images and brief descriptions
- Encourage users to take advantage of these new options

3. **Week 3:** [Filtering Tips Email]
Subject: Tips for Using Our Advanced Filtering System

- Share tips on how to use the filtering system effectively
- Highlight the various filters available (location, price ra

In [10]:
model = setup_peft_model(model)
print(f'trainable params: {100 * count_trainable_parameters(model) / model.num_parameters()}')

trainable params: 1.0473970386995128


In [12]:
model.config.use_cache = False

run_name = "mistral-7b-it-gpt4-emails"
trainer = initialize_trainer(model, tokenized_train_dataset, tokenized_val_dataset, run_name)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
25,1.8565,1.695013
50,1.5623,1.621202
75,1.4683,1.604276
100,1.3914,1.60553




TrainOutput(global_step=110, training_loss=1.5524754090742632, metrics={'train_runtime': 1028.0165, 'train_samples_per_second': 0.837, 'train_steps_per_second': 0.107, 'total_flos': 4.333380286758912e+16, 'train_loss': 1.5524754090742632, 'epoch': 5.0})

In [27]:
torch.cuda.empty_cache()

Memory consumption notes 

With QLoRA: 
- Idle: 5500MB
- Inference: 6600MB
- Finetuning batch size 64: 20200MB

Without quantization: 
- Idle: 29086MiB
- Inference: 29776MiB

In [3]:
print('How does model perform AFTER fine-tuning?')
torch.cuda.empty_cache()

hf_modelname = "mistralai/Mistral-7B-Instruct-v0.2"
run_name = "mistral-7b-it-gpt4-emails"
model = initialize_model(hf_modelname)
tokenizer = initialize_tokenizer(hf_modelname)

for i in range(25, 125, 25):
    checkpoint_name = f"{run_name}/checkpoint-{i}"
    ft_model = PeftModel.from_pretrained(model, checkpoint_name)

    inputs = tokenizer("""[INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]""",
        return_tensors="pt", return_attention_mask=False).to("cuda")

    outputs = ft_model.generate(**inputs, max_length=4096 - inputs['input_ids'].shape[1])
    text = tokenizer.batch_decode(outputs)[0]
    print(checkpoint_name)
    print(text)

How does model perform AFTER fine-tuning?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


mistral-7b-it-gpt4-emails/checkpoint-25
<s> [INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]</s>1. **Week 1-2: Introduction to the Marketplace**
    - Email 1: Welcome to the new marketplace for buying apartments.
    - Email 2: Introduce the benefits of using the marketplace for apartment purchases.
    - Email 3: Share success stories of previous buyers.
    - Email 4: Highlight the ease of use and convenience of the platform.

2. **Week 3-4: Showcasing the Marketplace**
    - Email 5: Showcase the variety of apartments available on the marketplace.
    - Email 6: Introduce the filtering and sorting options to help users find their perfect apartment.
    - Email 7: Share testimonials from satisfied buyers.

3. **Week 5-6: Special Offers and Discounts**
    - Email 8: Announce a special offer or discount for new users.
    

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


mistral-7b-it-gpt4-emails/checkpoint-50
<s> [INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]</s>  Proposed Email Calendar Plan  - Activation Campaign for Web Users -  Based on the provided goal, customer segment, and company description, the proposed email calendar plan is as follows:  - **Month 1: Introduction to the Marketplace**  - Week 1: Welcome Email - Introduce the marketplace and its unique selling points. - Week 2: Feature Highlight - Showcase the ease of finding apartments with the marketplace. - Week 3: Testimonials - Share customer success stories. - Week 4: Special Offer - Offer a discount or special deal to new users.  - **Month 2: Engagement and Retention**  - Week 1: Personalized Recommendations - Send personalized apartment recommendations based on user preferences. - Week 2: Marketplace Updates - Inform use

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


mistral-7b-it-gpt4-emails/checkpoint-75
<s> [INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]</s>  Proposed Email Calendar Plan:  Based on the goal of activation, the email calendar will be designed to engage new web users with a series of informative and promotional emails aimed at showcasing the benefits of using the apartment marketplace platform. The plan is as follows:  Month 1:  - Week 1: Welcome Email - Introduce new users to the platform, highlighting the ease of finding and purchasing apartments.  - Week 2: Featured Listings - Showcase a selection of popular or unique listings to entice users to explore the marketplace.  - Week 3: How It Works - Provide a step-by-step guide on how to use the platform, making it easy for new users to navigate and find what they're looking for.  - Week 4: Testimonials - Share positive 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


mistral-7b-it-gpt4-emails/checkpoint-100
<s> [INST]Produce an email calendar. 
            Goal: Activation. 
            Duration: 6 months. 
            Customer segment: Web users.
            Company description: Marketplace for buying apartments[/INST]</s>�  Email Calendar Plan: Activation Campaign for Web Users (6 Months) ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [2]:
hf_modelname = "mistralai/Mistral-7B-Instruct-v0.2"
run_name = "mistral-7b-it-gpt4-emails"
checkpoint_name = f"{run_name}/checkpoint-75"

model = initialize_model(hf_modelname, use_quantization=True)
tokenizer = initialize_tokenizer(hf_modelname)
ft_model = PeftModel.from_pretrained(model, checkpoint_name)


ft_model = ft_model.merge_and_unload()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [5]:
access_token = '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'
ft_model.push_to_hub(run_name, use_temp_dir=False, token=access_token)
tokenizer.push_to_hub(run_name, use_temp_dir=False, token=access_token)


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/azamatomu/mistral-7b-it-gpt4-emails/commit/8f23f52d19ab0bc055057c6120256b0f37e67c70', commit_message='Upload tokenizer', commit_description='', oid='8f23f52d19ab0bc055057c6120256b0f37e67c70', pr_url=None, pr_revision=None, pr_num=None)