Code inspired by https://github.com/NVIDIA/workbench-example-mistral-finetune/blob/main/code/mistral-finetune.ipynb

In [None]:
# Quantization mode for this run; it is read when the base model is loaded
# and when the training precision flags are chosen.
# Choose from ("none" | "8bit" | "4bit").
QUANTIZATION = "4bit"

In [None]:
import os

# Set the GPU index: expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

In [None]:
dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")

# Fixed seed so the shuffled splits -- and therefore the subsets selected
# below -- are the same on every run.
SPLIT_SEED = 42
# Cap on the number of examples kept from each split, to bound run time.
SUBSET_SIZE = 1000

# 80% train, then the remaining 20% is halved into validation and test.
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
val_test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)


def _head(split, limit=SUBSET_SIZE):
    """Return the first ``limit`` rows of ``split`` (all rows if it is shorter)."""
    return split.select(range(min(limit, len(split))))


# Because the splits are shuffled, taking the head of each one yields a
# random (but, thanks to the seed, reproducible) sample.
train_dataset = _head(dataset["train"])
eval_dataset = _head(val_test_dataset["train"])
test_dataset = _head(val_test_dataset["test"])


In [None]:
# Quantization configs, built up front so one can be picked by name later.

# --- 8-bit ---------------------------------------------------------------
bb_config_8b = BitsAndBytesConfig(load_in_8bit=True)

# --- 4-bit: "nf4" quant type with double quantization enabled -------------
bb_config_4b = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

def quantization_config(quantization):
    """Return the BitsAndBytes config matching a quantization mode name.

    Args:
        quantization: One of "none", "8bit" or "4bit".

    Returns:
        ``bb_config_8b`` for "8bit", ``bb_config_4b`` for "4bit", and ``None``
        for "none" (i.e. no quantization config).

    Raises:
        ValueError: If ``quantization`` is not one of the supported names.
            Previously any unrecognized value (including a typo) silently
            selected the 4-bit config.
    """
    if quantization == "none":
        return None
    if quantization == "8bit":
        return bb_config_8b
    if quantization == "4bit":
        return bb_config_4b
    raise ValueError(
        f"Unsupported quantization {quantization!r}; "
        'expected one of "none", "8bit", "4bit".'
    )

In [None]:
# Base checkpoint to fine-tune, and the name the tuned model is saved under.
model_id = "mistralai/Mistral-7B-v0.1"
refined_model = "mistralai-enhanced"


if QUANTIZATION != "none":
    # Quantized load: the whole model is mapped to device 0.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config(QUANTIZATION),
        device_map={"": 0},
    )
else:
    # Unquantized load, then move the model to the GPU.
    base_model = AutoModelForCausalLM.from_pretrained(model_id)
    model = base_model.to("cuda")

In [None]:
# Tokenizer for the base checkpoint: max length 512, left-side padding,
# with add_eos_token enabled.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_eos_token=True,
    padding_side="left",
    model_max_length=512,
)

# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    """Tokenize ``prompt`` to a fixed length of 512 and attach labels.

    The text is truncated and padded to ``max_length``; the returned encoding
    gains a ``labels`` entry that is a copy of its ``input_ids``.
    """
    encoding = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    input_ids = encoding["input_ids"]
    encoding["labels"] = input_ids.copy()
    return encoding

In [None]:
def process_prompt(data):
    """Format one dataset record as an instruction prompt and tokenize it.

    Reads the ``instruction``, ``input`` and ``output`` fields of ``data`` and
    returns the result of ``tokenize`` on the assembled text.
    """
    instruction = data["instruction"]
    inputs = data["input"]
    answer = data["output"]
    # NOTE(review): "\\n" below yields a literal backslash followed by "n",
    # not a newline -- confirm this is the intended prompt format.
    text = f"<s>[INST] {instruction} here are the inputs {inputs} [/INST] \\n {answer} </s>"
    return tokenize(text)


# Tokenize every example of the training subset.
tokenized_train_ds = train_dataset.map(process_prompt)

In [None]:
# Enable gradient checkpointing on the loaded model.
model.gradient_checkpointing_enable()

# Run peft's k-bit training preparation and keep the returned model.
model = prepare_model_for_kbit_training(model)

def print_param_info(model):
    """
    Print the trainable and total parameter counts of ``model``.

    Walks ``model.named_parameters()`` once, summing ``numel()`` over all
    parameters and over those with ``requires_grad`` set, then prints both
    totals and the trainable share as a percentage.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )
# Print the prepared model for inspection.
print(model)


In [None]:
# LoRA adapter settings: r=8, alpha=16, dropout 0.05, bias="none", applied to
# the attention and MLP projections plus the LM head.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ],
)

# Wrap the model with the adapters, then report the parameter counts.
model = get_peft_model(model, config)
print_param_info(model)

In [None]:
# Parallelization is possible if the system is multi-GPU; the snippet below
# is kept, disabled, for reference:
#if torch.cuda.device_count() > 1:
#    model.is_parallelizable = True
#    model.model_parallel = True

# This run turns both parallelism flags off.
model.is_parallelizable = False
model.model_parallel = False

# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Training configs
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_ds,
    args=transformers.TrainingArguments(
        output_dir="./mistral-code-instruct",
        num_train_epochs=1,
        # warmup_steps=5,  (disabled)
        # Batch of 2 per device, accumulated over 4 steps.
        per_device_train_batch_size=2,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        max_steps=-1,
        learning_rate=2.5e-5,
        logging_steps=50,
        # Mixed precision: fp16 for 8-bit runs, bf16 otherwise, so exactly one
        # of the two flags is on.
        bf16=QUANTIZATION != "8bit",
        fp16=QUANTIZATION == "8bit",
        optim="paged_adamw_8bit",
        logging_dir="./logs",
        # Write a checkpoint every 50 steps.
        save_strategy="steps",
        save_steps=50,
        report_to="none",
    ),
    # NOTE(review): the pad token is set to the EOS token in this file; with
    # mlm=False this collator is expected to rebuild the labels and mask pad
    # positions, which would then include real EOS tokens -- confirm against
    # the transformers documentation.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off use_cache for training (silences warnings). If using the model for
# inference, consider re-enabling it.
model.config.use_cache = False

import time

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during a long training run.
starttime = time.perf_counter()
# Train!
trainer.train()

# Elapsed training time in seconds.
print("Training time: ", time.perf_counter() - starttime)
# Save the fine-tuned model under the name held in `refined_model`.
trainer.save_model(refined_model)