In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_from_disk, concatenate_datasets
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments

In [None]:
TOKEN = 'hf_'

In [None]:
# Load model

base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                             quantization_config=bnb_config)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=4096,
    padding_side="left",
    add_eos_token=True)

# tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token = tokenizer.unk_token
model.config.pad_token_id = tokenizer.pad_token_id


In [2]:
# Data preparation
from datasets import Dataset

def multiply(a, b):
    return a * b

def generate_dataset(start, end, function):
     return [(i, j, function(i, j)) for i in range(start, end + 1) for j in range(start, end + 1)]

PROMPT = "[INST] Return only result witn no explanation: {inst} [/INST] = {ans}"

train_dataset = generate_dataset(60, 100, multiply)
train_dataset = {
    "text": [
        PROMPT.format(inst=f'{a} * {b}', ans=c) for a, b, c in train_dataset
    ],
}
dataset = Dataset.from_dict(train_dataset)


dataset = dataset.shuffle()

In [3]:
# Prepare for training
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
config = LoraConfig(
    r=128,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

if torch.cuda.device_count() > 1:  # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True
    print('### multiple GPU ###')

trainable params: 340164608 || all params: 4092235776 || trainable%: 8.312439131561906


In [4]:
# Training
output_dir = "./calc_mistral"
tokenizer.pad_token = tokenizer.eos_token

response_template = '[/INST] ='
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=32,
    dataset_num_proc=2,
    packing=False,
    data_collator=collator,
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        warmup_steps=5,
        num_train_epochs=3,
        #max_steps=20,
        learning_rate=2e-5,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to='none',  # or log to WanDB
        logging_dir="./logs",
    ),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Map (num_proc=2):   0%|          | 0/1681 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,0.8079
20,0.3852
30,0.3196
40,0.2535
50,0.2398
60,0.1879
70,0.182


TrainOutput(global_step=78, training_loss=0.3207863783225035, metrics={'train_runtime': 426.8437, 'train_samples_per_second': 11.815, 'train_steps_per_second': 0.183, 'total_flos': 6970663756775424.0, 'train_loss': 0.3207863783225035, 'epoch': 2.96})

In [None]:
# 4-bit version 
# model = model.merge_and_unload()
# model.push_to_hub("adriata/calc_mistral", 
#                   use_temp_dir=False, 
#                   token=TOKEN)
# tokenizer.push_to_hub("adriata/calc_mistral", 
#                       use_temp_dir=False, 
#                       token=TOKEN)

In [5]:
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
new_model = "calc_mistral"
new_model_id = "adriata/calc_mistral_v2"

model = AutoModelForCausalLM.from_pretrained(base_model_id)
model_merge = PeftModel.from_pretrained(model, new_model)
model_merge = model_merge.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id)

In [None]:
model_merge.push_to_hub("adriata/calc_mistral", 
                  use_temp_dir=False, 
                  token=TOKEN)
tokenizer.push_to_hub("adriata/calc_mistral_v2", 
                      use_temp_dir=False, 
                      token=TOKEN)