### 張量看板

tensorboard --logdir=results

In [1]:
import torch

torch.cuda.empty_cache()
print(f"GPU: {torch.cuda.get_device_name(0)}")

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import transformers

from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM


GPU: NVIDIA GeForce RTX 4090


2023-12-27 17:08:59.397548: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-27 17:08:59.416532: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-27 17:08:59.416556: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-27 17:08:59.416571: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-27 17:08:59.420828: I tensorflow/core/platform/cpu_feature_g

## Config

In [2]:
model_path = "/user_data/Llama-2-7b-chat-hf"
new_model = "llama2_trained_bf16_2048"
output_dir = "./results_test_params" # tensorboard結果
dataset_name = "gbharti/finance-alpaca"

## LoRA parameters

In [3]:
# LoRA attention dimension (Rank)
lora_r = 64

# Alpha parameter for LoRA scaling (The weight matrix is scaled by 𝑙𝑜𝑟𝑎_𝑎𝑙𝑝ℎ𝑎 / 𝑙𝑜𝑟𝑎_𝑟𝑎𝑛𝑘)
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

## bitsandbytes parameters for QLoRA

In [4]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "bfloat16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

## TrainingArguments parameters

In [15]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results_test_params"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (llama2 use bf16)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.2

# Initial learning rate (AdamW optimizer)
learning_rate = 5e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

## SFT parameters

In [6]:
# Maximum sequence length to use
max_seq_length = 2048

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0} # or "auto"

## Load llama2

In [7]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = '[PAD]' # 缺的塞這個 -> llama2 沒有 pad_token
tokenizer.padding_side = "right"

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



## Load Dataset

In [8]:
def transform_data_format(data):
    # transform data
    data["formated"] = f"[INST] <<SYS>>\nYou are a finance expert, proficient in financial issues. Please provide relevant information based on the instruction given.\n<</SYS>>\n\ninstruction:\n{data['instruction']} [/INST] {data['output']} </s>"
    return data

In [9]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.map(transform_data_format, remove_columns=["text","instruction","input","output"])

In [10]:
# TODO dataset.filter for long sequence, filt MAX_LENGTH，是 tokenize 後的長度，超過就丟掉
print("="*80)
print("Before filter")
print(dataset)
print("="*80)
dataset = dataset.filter(lambda example: len(tokenizer(example['formated'])['input_ids']) <= max_seq_length)
print("="*80)
print("After filter")
print(dataset)
print("="*80)

Before filter
Dataset({
    features: ['formated'],
    num_rows: 68912
})
After filter
Dataset({
    features: ['formated'],
    num_rows: 68895
})


In [11]:
# a = tokenizer.tokenize(dataset[3]['formated'])
print(dataset[0]['formated'])
# a

[INST] <<SYS>>
You are a finance expert, proficient in financial issues. Please provide relevant information based on the instruction given.
<</SYS>>

instruction:
For a car, what scams can be plotted with 0% financing vs rebate? [/INST] The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money. Of course the money takes years to come in, or they sell your loan to another business to get the money faster but in a smaller amount. You trade in a car and they sell it at a profit. Of course that new transaction could be a lump sum or a loan on the used car... They or course make money if you bring the car back for maintenance, or you buy lots of expensive dealer options. Some dealers wa

## Inference Before Training

In [12]:
# # Run text generation pipeline with our next model
# prompt = "401k with paltry match or SPY ETF?"
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# result = pipe(f"[INST] {prompt} [/INST]")
# print(result[0]['generated_text'])

## Training

DataCollatorForCompletionOnlyLM 會把 instruction_template 到 response_template 之間的 label 設成 -100，所以就不會計算 loss

In [13]:
# DataCollatorForCompletionOnlyLM 會把 instruction_template 到 response_template 之間的 label 設成 -100，所以就不會計算 loss
instruction_template = "[INST]"
response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

In [16]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="formated", # 自己處理後的欄位
    data_collator=collator, # Set supervised collactor
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)





In [17]:
# Train model
trainer.train()

# Save trained model (LoRA params)
trainer.model.save_pretrained(new_model) # 儲存lora參數

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.0888
50,4.0709
75,2.2827
100,4.3648
125,2.3356
150,3.8363
175,2.2112
200,3.2161
225,1.9944
250,2.4597


## After training empty VRAM

In [19]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()
gc.collect()

NameError: name 'model' is not defined

## Reload model in FP16 and merge it with LoRA weights

In [20]:
# Reload model in FP16 and merge it with LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = '[PAD]'
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Save Full Model

In [21]:
model.save_pretrained(new_model+"-full") # 儲存完整模型
tokenizer.save_pretrained(new_model+"-full")


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


('llama2_trained_bf16_2048-full/tokenizer_config.json',
 'llama2_trained_bf16_2048-full/special_tokens_map.json',
 'llama2_trained_bf16_2048-full/tokenizer.json')

## Inference

In [23]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "401k with paltry match or SPY ETF?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2048)
result = pipe(f"[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

[INST] 401k with paltry match or SPY ETF? [/INST] I would go with the SPY ETF. 401k is a great way to save for retirement, but it's not the only way.  You can also save outside of your 401k.  The SPY ETF is a great way to invest in the stock market.  It's a broad index fund that tracks the S&P 500.  It's a great way to invest in the stock market without having to pick individual stocks.  It's also a great way to invest in the stock market without having to pay a lot of fees.  The SPY ETF is a low-cost index fund that tracks the S&P 500.  It's a great way to invest in the stock market without having to pay a lot of fees.  It's also a great way to invest in the stock market without having to pick individual stocks. 


## Push to hugging face

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()  # ensure token gives write access

!huggingface-cli whoami

In [None]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)