In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so that later cells can work with files stored there.
drive.mount('/content/drive/')

In [None]:
!pip3 install tokenizers sentencepiece #wandb

In [None]:
# !pip install transformers
!pip install git+https://github.com/llohann-speranca/transformers.git@fix-resume-checkpoint-for-peftmodel

In [None]:
!pip3 install huggingface-hub

In [None]:
!pip3 install datasets peft trl

In [None]:
!pip3 install bertviz

In [None]:
!pip install accelerate

In [None]:
!pip install bitsandbytes einops

In [None]:
import os

# Work inside the project folder on the mounted Drive.
# An absolute path (rooted at the '/content/drive/' mount point) keeps this cell re-runnable:
# a chain of relative chdir calls only succeeds once, from the initial working directory.
os.chdir(os.path.join('/content/drive', 'My Drive', 'Experiment', 'LLMs'))

In [None]:
# Output directory, relative to the current working directory; it is later passed to
# TrainingArguments as `output_dir`.
OUTPUT_DIR = './llama2-outputs/'
# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
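# Optional: uncomment the two lines below to log in to the Hugging Face Hub from the notebook
# (e.g. when the chosen model requires authentication).
# from huggingface_hub import notebook_login

# notebook_login()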

In [None]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft.tuners.lora import LoraLayer

from trl import SFTTrainer

In [None]:
########################################################################
# This is a simple example of supervised fine-tuning with trl's SFTTrainer.
#
# It fine-tunes a causal language model (configured for Llama 2 below; other
# architectures such as GPT-2 or GPT-Neo need different LoRA target modules)
# with the SFTTrainer from trl, and uses the PEFT library to train LoRA
# adapters on the model.
#
########################################################################

## Configs and Arguments

In [None]:
class Arguments:
    """Run configuration for the fine-tuning notebook, kept as plain class attributes.

    NOTE(review): `local_rank`, `per_device_eval_batch_size`, `weight_decay`,
    `num_train_epochs` and `new_model` are not read anywhere in the visible cells.
    """

    # --- Model and data -------------------------------------------------------
    model_name = "NousResearch/Llama-2-7b-hf"  # alternatives: 'meta-llama/Llama-2-7b-chat-hf', 'NousResearch/Llama-2-13b-hf'
    dataset_name = "databricks/databricks-dolly-15k"  # alternatives: "mlabonne/guanaco-llama2-1k"
    new_model = "Llama-2-7b-dolly"
    max_seq_length = 1024  # other values noted by the author: 768, 512

    # --- 4-bit quantization (bitsandbytes) ------------------------------------
    use_4bit = True
    use_nested_quant = False
    bnb_4bit_quant_type = "nf4"
    # NOTE(review): compute dtype is float16 while training below runs with bf16=True -- confirm intended.
    bnb_4bit_compute_dtype = "float16"

    # --- LoRA -----------------------------------------------------------------
    lora_r = 64
    lora_alpha = 16
    lora_dropout = 0.1

    # --- Batching -------------------------------------------------------------
    local_rank: int = -1
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 1
    gradient_accumulation_steps = 1
    packing = False
    group_by_length: bool = True  # "Group sequences into batches with same length. Saves memory and speeds up training considerably."

    # --- Optimization ---------------------------------------------------------
    optim = "paged_adamw_32bit"
    learning_rate = 2e-4
    lr_scheduler_type = "constant"  # Constant a bit better than cosine, and has advantage for analysis
    warmup_ratio: float = 0.03
    weight_decay = 0.001
    max_grad_norm = 0.3
    gradient_checkpointing = True

    # --- Precision ------------------------------------------------------------
    fp16 = False
    bf16 = True

    # --- Schedule, checkpointing and logging ----------------------------------
    num_train_epochs = 1
    max_steps: int = 10000
    save_steps: int = 100
    logging_steps: int = 100


# Single shared configuration instance used by the cells below.
script_args = Arguments()

In [None]:
# Load the entire model on the GPU 0
# NOTE(review): create_and_prepare_model builds its own identical local `device_map`, so this
# module-level variable is not read anywhere in the visible cells.
device_map = {"": 0}


## Model

In [None]:
from torch.nn import functional as F


# NEFTune (noisy embeddings for instruction fine-tuning); references:
# <https://github.com/huggingface/trl/issues/870>
# <https://github.com/neelsjain/NEFTune#code>
def NEFTune(model, noise_alpha=5):
    """Patch the model's token-embedding layer so it adds uniform noise while training.

    Args:
        model: object exposing a ``training`` flag and an embedding module at
            ``model.base_model.model.model.embed_tokens``.
        noise_alpha: noise scale. Noise is drawn uniformly from ``[-m, m)`` with
            ``m = noise_alpha / sqrt(size(1) * size(2))`` of the embedding output.

    Returns:
        The same ``model`` object, with ``embed_tokens.forward`` replaced by the noisy wrapper.

    Note:
        Calling this twice on the same model wraps the already-wrapped forward, so noise
        would be added twice per call.
    """
    def noised_embed(orig_forward, noise_alpha):
        def new_func(x):
            embed_init = orig_forward(x)
            # during training, we add noise to the embedding
            # during generation, we don't add noise to the embedding
            if not model.training:
                return embed_init
            # The indexing below requires the embedding output to have at least 3 dimensions.
            dims = embed_init.size(1) * embed_init.size(2)
            mag_norm = noise_alpha / dims ** 0.5
            return embed_init + torch.zeros_like(embed_init).uniform_(-mag_norm, mag_norm)
        return new_func
    ##### NOTE: this is for a LLaMA model #####
    ##### For a different model, you need to change the attribute path to the embedding #####
    embed_tokens = model.base_model.model.model.embed_tokens
    # Capture the ORIGINAL bound forward before overriding it. Passing the module itself and
    # calling it from the wrapper would dispatch back to the overridden `forward` and recurse
    # without end.
    embed_tokens.forward = noised_embed(embed_tokens.forward, noise_alpha)
    return model

In [None]:
def create_and_prepare_model(args):
    """Load the quantized base model, build the LoRA config and load the tokenizer.

    Every setting is read from ``args``; the function no longer reaches for the
    module-level ``script_args``.

    Args:
        args: configuration object exposing ``model_name``, ``use_4bit``,
            ``use_nested_quant``, ``bnb_4bit_quant_type``, ``bnb_4bit_compute_dtype``
            (the name of a ``torch`` dtype attribute), ``lora_alpha``, ``lora_dropout``,
            ``lora_r`` and ``gradient_checkpointing``.

    Returns:
        tuple: ``(model, peft_config, tokenizer)``. The LoRA config is only built and
        returned here; it is not applied to the model inside this function.
    """
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Hint only: printed when the 4-bit compute dtype is float16 and the GPU reports
    # compute capability major version >= 8.
    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # Load the entire model on GPU 0.
    device_map = {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, quantization_config=bnb_config, device_map=device_map, trust_remote_code=True
    )

    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        # Projection layers that receive LoRA adapters. Other architectures name these layers
        # differently (e.g. "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h").
        target_modules=[
            'q_proj',
            'k_proj',
            'v_proj',
            'o_proj',
            'gate_proj',
            'up_proj',
            'down_proj',
        ],
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # The EOS token is reused as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    return model, peft_config, tokenizer


In [None]:
# Build the Trainer configuration. Each option listed below is copied from `script_args` under
# the same keyword name; `output_dir` comes from OUTPUT_DIR.
# NOTE(review): `weight_decay` and `num_train_epochs` are defined on the config object but are
# not forwarded here -- confirm that is intended.
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    **{
        option: getattr(script_args, option)
        for option in (
            "per_device_train_batch_size",
            "gradient_accumulation_steps",
            "optim",
            "save_steps",
            "logging_steps",
            "learning_rate",
            "fp16",
            "bf16",
            "max_grad_norm",
            "max_steps",
            "warmup_ratio",
            "group_by_length",
            "lr_scheduler_type",
        )
    },
)

# Load the quantized model, the LoRA config and the tokenizer, then the training split.
model, peft_config, tokenizer = create_and_prepare_model(script_args)
# The generation cache is switched off for training.
model.config.use_cache = False
dataset = load_dataset(script_args.dataset_name, split="train")

## Training

In [None]:
# Display the loaded model's repr as the cell output.
model

In [None]:
# Inspect the first record of the training split.
dataset[0]

In [None]:
def formatting_prompts_func(example):
    """Render a batch of records into one training prompt string per record.

    Args:
        example: mapping with the columns ``'instruction'``, ``'context'`` and
            ``'response'``, each a sequence with one entry per record
            (the columns are assumed to have equal length).

    Returns:
        list[str]: one formatted prompt per record, in input order.
    """
    # Built once per call; {0} is the instruction, {1} the context, {2} the response.
    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction
{0}

Input:
{1}

### Response:
{2}

### End"""
    return [
        template.format(instruction, input_context, response)
        for instruction, input_context, response in zip(
            example['instruction'], example['context'], example['response']
        )
    ]

In [None]:
# Supervised fine-tuning trainer. Prompts are rendered by `formatting_prompts_func` instead of
# being read from a ready-made text column (the alternative is `dataset_text_field="text"`
# in place of `formatting_func`).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    max_seq_length=script_args.max_seq_length,
    packing=script_args.packing,
)

In [None]:
# Adjust per-module dtypes on the wrapped model. The casts are applied in this order for every
# module: LoRA layers -> bfloat16 (only with bf16), anything with "norm" in its name -> float32,
# then float32 lm_head / embed_tokens weights -> bfloat16 (only with bf16).
# `Module.to` modifies the module in place, so its return value does not need to be kept.
for name, module in trainer.model.named_modules():
    if isinstance(module, LoraLayer) and script_args.bf16:
        module.to(torch.bfloat16)
    if "norm" in name:
        module.to(torch.float32)
    if (
        ("lm_head" in name or "embed_tokens" in name)
        and hasattr(module, "weight")
        and script_args.bf16
        and module.weight.dtype == torch.float32
    ):
        module.to(torch.bfloat16)

In [None]:
# Run the fine-tuning loop with the trainer built above.
trainer.train()