https://medium.com/@mohitdulani/fine-tune-any-llm-using-your-custom-dataset-f5e712eb6836 template by mohit dulani

In [1]:
!pip install -q torch dataset peft trl transformers bitsandbytes

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.53-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting datasets (from trl)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Downlo

In [2]:
import torch
from datasets import load_dataset
from google.colab import userdata, drive
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

In [6]:
access_token = userdata.get("ACCESS_TOKEN")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
drive.mount('/content/drive')
%cd /content/drive/MyDrive/projects/finance-llm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/projects/finance-llm


In [14]:
model_name = "mistralai/Mistral-7B-V0.3"

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [9]:
peft_config = LoraConfig(
    r=16, lora_alpha=64, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    max_grad_norm=0.35,
    warmup_ratio=0.03,
    max_steps=100,
    save_steps=100,
    lr_scheduler_type="constant",
)

In [15]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=access_token
)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [27]:
dataset = load_dataset('json', data_files='output_formatted.jsonl', field='data', split='train')
dataset = dataset.train_test_split(test_size=0.1)

In [28]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 1207
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 135
    })
})


In [29]:
def createPrompt(example):
    bos_token = "<s>"
    system_prompt = "[INST] You are a finance suggestion model and your role is to give finance related suggestions \n"
    input_prompt = f" {example['input']} [/INST]"  # depends on the dataset
    output_prompt = f"{example['output']} </s>"

    return [bos_token + system_prompt + input_prompt + output_prompt]

In [30]:
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    formatting_func=createPrompt,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1207 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [31]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [32]:
trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
10,0.7703
20,0.1676
30,0.0302
40,0.0214
50,0.0195
60,0.0179
70,0.0157
80,0.0148
90,0.0149
100,0.014



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-V0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.3 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-V0.3.


TrainOutput(global_step=100, training_loss=0.10863254189491273, metrics={'train_runtime': 325.5243, 'train_samples_per_second': 1.229, 'train_steps_per_second': 0.307, 'total_flos': 4374909891379200.0, 'train_loss': 0.10863254189491273, 'epoch': 100.0})

In [33]:
save_model = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
save_model.save_pretrained("./results/model")


Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-V0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.3 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-V0.3.


In [34]:
lora_config = LoraConfig.from_pretrained("./results/model")
model = get_peft_model(model, lora_config)

In [None]:
text = input(">>> ")
text = f"Question: {text}"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
inputs = tokenizer(text, return_tensors='pt').to(device)
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

>>> how to invest in mutual funds


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
