In [1]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, HfArgumentParser,
                          TrainingArguments, logging, pipeline)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm

BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64
Loading CUDA version: BNB_CUDA_VERSION=118


  warn((f'\n\n{"="*80}\n'


In [2]:
# --- Identifiers --------------------------------------------------------
# Hub id of the base checkpoint passed to the `from_pretrained` calls below.
model_name = "meta-llama/Llama-2-13b-chat-hf"
# Hub id of the dataset handed to `load_dataset`.
dataset_name = "vibhorag101/phr_mental_health_dataset"
# Name used when saving the fine-tuned adapter and when pushing to the Hub.
new_model = "llama-2-13b-chat-hf-phr_mental_therapy"

# --- Core training hyperparameters ---------------------------------------
num_train_epochs = 2
# Train and eval share the same per-device batch size.
per_device_train_batch_size = per_device_eval_batch_size = 2
gradient_accumulation_steps = 1
# Forwarded to SFTTrainer as `max_seq_length`.
max_seq_length = 4096

In [3]:
# QLoRA (LoRA + bitsandbytes 4-bit) settings.
# The lora_* values are forwarded to LoraConfig, the use_4bit / bnb_* /
# use_nested_quant values to BitsAndBytesConfig.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"  # name of a torch dtype; resolved to `compute_dtype` below
bnb_4bit_quant_type = "nf4"
use_nested_quant = False  # passed as `bnb_4bit_use_double_quant`

# Mixed-precision switches forwarded to TrainingArguments.
# NOTE(review): bf16 training is enabled while the 4-bit compute dtype above is
# float16 -- confirm this combination is intentional.
fp16 = False
bf16 = True

output_dir = "./results"

# Optimisation / scheduling settings consumed by TrainingArguments.
gradient_checkpointing = False
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

# SFTTrainer / model-placement settings.
packing = False
device_map = {"": 0}  # maps the root module ("") to device index 0

# Resolve the dtype name to the actual torch dtype object (e.g. torch.float16).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Informational hint only. Querying the device capability needs a CUDA device,
# so the check is skipped (instead of raising) when none is available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [None]:
# Load the "train" split of the Hub dataset.
raw_dataset = load_dataset(dataset_name, split="train")

# Draw a random subset of at most 1000 examples. The seed makes the subset
# reproducible across kernel restarts; the size is clamped so a smaller
# dataset does not make `select` fail with an out-of-range index.
subset_size = min(1000, len(raw_dataset))
dataset_subset = raw_dataset.shuffle(seed=42).select(range(subset_size))

# `train_test_split` returns a new DatasetDict and leaves its input untouched,
# so the result has to be kept. `dataset` (consumed by the trainer cell) is
# bound to the 80% train part; the held-out 20% is exposed as `eval_dataset`.
dataset_splits = dataset_subset.train_test_split(test_size=0.2, seed=42)
dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

# Final expression: display the resulting splits as the cell output.
dataset_splits

In [4]:
# bitsandbytes quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# LoRA adapter settings handed to SFTTrainer as `peft_config`.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Trainer settings. Every value comes from the configuration cells above.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Previously defined in the config cell but never forwarded.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Previously defined in the config cell but never forwarded, so changing
    # it there had no effect.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
    # NOTE(review): the TrainingArguments docs describe this field as not used
    # directly by Trainer; resuming requires passing `resume_from_checkpoint`
    # to `trainer.train()`. Left unchanged here -- confirm whether it is needed.
    resume_from_checkpoint=True
)

In [None]:
# Load the base checkpoint with the bitsandbytes quantization config, placed
# according to `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
# Config tweaks applied before training.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the tokenizer for the same checkpoint: non-fast implementation, with
# `add_eos_token=True` requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    use_fast=False,
    trust_remote_code=True,
)
# Hard-coded vocabulary id used as the padding token.
# NOTE(review): which token id 18610 maps to is not visible here -- confirm.
tokenizer.pad_token_id = 18610

In [None]:
# Supervised fine-tuning: build the trainer from the objects prepared in the
# earlier cells, run training, then write the trained model to `new_model`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",  # dataset column that holds the training text
    max_seq_length=max_seq_length,
    packing=packing,
)

trainer.train()

# Save under the local path given by `new_model`.
trainer.model.save_pretrained(new_model)

In [5]:
# Merge step: reload the base checkpoint in float16 and fold the LoRA weights
# saved under `new_model` into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cpu",  # the merge is done on the CPU
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Wrap the base model with the saved adapter, then merge and unload it so
# `model` is the result of `merge_and_unload()`.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Same tokenizer setup as in the training cell.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    use_fast=False,
    trust_remote_code=True,
)
tokenizer.pad_token_id = 18610

# Previous tokenizer setup, kept for reference only: with it the EOS token
# could not be generated.
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"



Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  5.49it/s]


In [None]:
# Alternative to the merge cell: run inference with the LoRA adapter applied
# on top of the quantized base model, without merging the weights.
# Note that this rebinds `base_model`, `model` and `tokenizer`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Attach the adapter stored under `new_model`; no merge is performed here.
model = PeftModel.from_pretrained(base_model, new_model)

# Same tokenizer setup as in the training cell.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    use_fast=False,
    trust_remote_code=True,
)
tokenizer.pad_token_id = 18610

In [6]:
# Upload whatever `model` currently refers to, then the tokenizer, to the Hub
# repository named by `new_model`. `use_temp_dir=False` is passed to both calls.
model.push_to_hub(new_model, use_temp_dir=False)
# Kept as the cell's final bare expression so its return value is displayed
# as the cell output.
tokenizer.push_to_hub(new_model, use_temp_dir=False)

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]
[A

pytorch_model-00003-of-00003.bin:   0%|          | 16.4k/6.18G [00:01<133:49:37, 12.8kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 98.3k/6.18G [00:01<20:30:41, 83.7kB/s] 

pytorch_model-00003-of-00003.bin:   0%|          | 197k/6.18G [00:01<10:29:43, 164kB/s]  

pytorch_model-00003-of-00003.bin:   0%|          | 393k/6.18G [00:01<5:04:29, 338kB/s] 

[A[A

pytorch_model-00003-of-00003.bin:   0%|          | 573k/6.18G [00:02<4:40:06, 368kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 836k/6.18G [00:02<3:48:02, 452kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 1.10M/6.18G [00:03<3:25:13, 502kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 1.41M/6.18G [00:03<3:00:45, 570kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 1.74M/6.18G [00:04<2:44:29, 626kB/s]

pytorch_model-00003-of-00003.bin:   0%|          | 2.06M/6.18G [00:04<2:35:09, 664kB/s]

pytorch_model

CommitInfo(commit_url='https://huggingface.co/vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy/commit/f70597f2874d1671d91b05bf55ad54ae97bf1339', commit_message='Upload tokenizer', commit_description='', oid='f70597f2874d1671d91b05bf55ad54ae97bf1339', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# `logging` is the module imported from transformers in the import cell; set
# its verbosity to CRITICAL before generating.
logging.set_verbosity(logging.CRITICAL)

# System instruction inserted between the <<SYS>> markers of the prompt.
SYSTEM_PROMPT = "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.If you don't know the answer to a question, please don't share false information. Always try to be as cheerfull as possible"

# Example user message.
prompt = "I am feeling suicidal.I have lost a lot of money in gambling. The money I used was taken as a loan"

# Text-generation pipeline over the currently loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1000,
    temperature=0.9,
)

# Assemble the prompt in the [INST] / <<SYS>> layout, run it through the
# pipeline and print the text of the first returned sequence.
llama_prompt = "<s>[INST]<<SYS>>{}<</SYS>> {} [/INST]".format(SYSTEM_PROMPT, prompt)
result = pipe(llama_prompt)
print(result[0]["generated_text"])